ephrin_binding¶

Analyze receptor selection data using soluble Ephrin-B2 or -B3 NOTE NEED TO FIX

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# input configs
altair_config = None
nipah_config = None

# input files
entropy_file = None
func_scores_E2_file = None
binding_E2_file = None
func_scores_E3_file = None
binding_E3_file = None

# output files
filtered_E2_binding_data = None
filtered_E3_binding_data = None
filtered_E2_binding_low_effect = None
filtered_E3_binding_low_effect = None

# output images
entry_binding_combined_corr_plot = None
entry_binding_combined_corr_plot_agg = None
E2_E3_correlation = None
E2_E3_correlation_site = None
combined_E2_E3_site_corr = None
binding_by_site_plot = None
entry_binding_corr_heatmap = None
binding_corr_heatmap = None
binding_region_boxplot_plot = None
binding_region_bubble_plot = None
max_binding_in_stalk = None
max_binding_in_contact = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entropy_file = "results/entropy/entropy.csv"
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
binding_E2_file = "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"
filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
filtered_E2_binding_low_effect = (
    "results/filtered_data/E2_binding_low_effect_filter.csv"
)
filtered_E3_binding_low_effect = (
    "results/filtered_data/E3_binding_low_effect_filter.csv"
)
entry_binding_combined_corr_plot = (
    "results/images/entry_binding_combined_corr_plot.html"
)
entry_binding_combined_corr_plot_agg = (
    "results/images/entry_binding_combined_corr_plot_agg.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
combined_E2_E3_site_corr = "results/images/combined_E2_E3_site_corr.html"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
entry_binding_corr_heatmap = "results/images/entry_binding_corr_heatmap.html"
binding_corr_heatmap = "results/images/binding_corr_heatmap.html"
binding_region_boxplot_plot = "results/images/binding_region_boxplot_plot.html"
binding_region_bubble_plot = "results/images/binding_region_bubble_plot.html"
max_binding_in_contact = "results/images/max_binding_in_contact.html"
max_binding_in_stalk = "results/images/max_binding_in_stalk.html"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if (
    os.getcwd()
    == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
):
    pass
    print("Already in correct directory")
else:
    os.chdir(
        "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
    )
    print("Setup in correct directory")
Setup in correct directory

hard paths for running in interactive mode¶

In [5]:
if nipah_config is None:
    ##hard paths in case don't want to run with snakemake
    print("loading hard paths")
    altair_config = "data/custom_analyses_data/theme.py"
    nipah_config = "nipah_config.yaml"
    entropy_file = "results/entropy/entropy.csv"

    # input files
    func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
    binding_E2_file = (
        "results/receptor_affinity/averages/EFNB2_monomeric_mut_effect.csv"
    )
    func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
    binding_E3_file = "results/receptor_affinity/averages/EFNB3_dimeric_mut_effect.csv"

    filtered_E2_binding_data = "results/filtered_data/E2_binding_filtered.csv"
    filtered_E3_binding_data = "results/filtered_data/E3_binding_filtered.csv"
    filtered_E2_binding_low_effect = (
        "results/filtered_data/E2_binding_low_effect_filter.csv"
    )
    filtered_E3_binding_low_effect = (
        "results/filtered_data/E3_binding_low_effect_filter.csv"
    )

Run config files to setup altair theme and config variables¶

In [6]:
if altair_config:
    with open(altair_config, "r") as file:
        exec(file.read())

with open(nipah_config) as f:
    config = yaml.safe_load(f)

Import the binding and entry data for EFNB2 and EFNB3¶

In [7]:
# import binding and entry data
e2 = pd.read_csv(binding_E2_file)
e2_func = pd.read_csv(func_scores_E2_file)
e3 = pd.read_csv(binding_E3_file)
e3_func = pd.read_csv(func_scores_E3_file)

Filter the data and save¶

In [8]:
def merge_func_binding_dfs(func, binding, name):
    # Merge the 'binding' and 'func' DataFrames on specified columns with outer join.
    # This combines data based on 'site', 'mutant', and 'wildtype' columns.
    # Suffixes are added to columns with identical names in both DataFrames for differentiation.
    # The result is rounded to three decimal places for neatness.
    df_int = pd.merge(
        binding,
        func,
        on=["site", "mutant", "wildtype"],
        suffixes=["_binding", "_cell_entry"],
        validate="one_to_one",
        how="outer",
    ).round(3)

    # Rename specific columns for clarity.
    df = df_int.rename(
        columns={
            "Ephrin binding_mean": "binding_mean",
            "Ephrin binding_std": "binding_std",
            "Ephrin binding_median": "binding_median",
        }
    )

    # Select and reorder a subset of columns for the final DataFrame.
    df = df[
        [
            "site",
            "wildtype",
            "mutant",
            "binding_mean",
            "binding_std",
            "times_seen_binding",
            "effect",
            "effect_std",
            "times_seen_cell_entry",
            "frac_models",
        ]
    ]

    # Define a function to filter the DataFrame based on several criteria.
    def filter_binding_data(df):
        # Apply multiple conditions to filter the data for relevant entries.
        df_filter = df[
            (df["mutant"] != "*")
            & (df["mutant"] != "-")
            & (df["site"] != 603)
            &
            # Conditions related to cell entry effects and observations.
            (df["effect"] >= config["min_func_effect_for_binding"])
            & (df["times_seen_cell_entry"] >= config["func_times_seen_cutoff"])
            & (df["effect_std"] <= config["func_std_cutoff"])
            &
            # Conditions related to binding observations and variability.
            (df["times_seen_binding"] >= config["min_times_seen_binding"])
            & (df["binding_std"] <= config["max_binding_std"])
            & (df["frac_models"] >= config["frac_models"])
        ]
        return df_filter

    # Filter the DataFrame using the defined function.
    df_filter = filter_binding_data(df)

    # Define a function to identify low effect mutants for further analysis.
    def store_filtered_info(df):
        df_low_filter = df[
            (df["mutant"] != "*")
            & (df["mutant"] != "-")
            & (df["site"] != 603)
            & (df["effect"] < config["min_func_effect_for_binding"])
            & (df["times_seen_cell_entry"] >= config["func_times_seen_cutoff"])
            & (df["effect_std"] <= config["func_std_cutoff"])
        ]
        return df_low_filter

    # Filter the DataFrame to identify low effect mutants.
    df_low_effect_filter = store_filtered_info(df)

    # Save the filtered data to CSV files, differentiating between EFNB2 and others.
    if name == "EFNB2":
        print(name)  # Print the name for confirmation.
        df_filter.to_csv(filtered_E2_binding_data, index=False)
        df_low_effect_filter.to_csv(filtered_E2_binding_low_effect, index=False)
    else:
        df_filter.to_csv(filtered_E3_binding_data, index=False)
        df_low_effect_filter.to_csv(filtered_E3_binding_low_effect, index=False)

    # Return the filtered DataFrames for further use.
    return df_filter, df_low_effect_filter

# Use the defined function to filter data for EFNB2 and EFNB3.
df_E2_filter, df_E2_filter_missing = merge_func_binding_dfs(e2_func, e2, "EFNB2")
df_E3_filter, df_E3_filter_missing = merge_func_binding_dfs(e3_func, e3, "EFNB3")

# Merge the filtered EFNB2 and EFNB3 DataFrames for combined analysis.
df_binding_effect_merge = pd.merge(
    df_E2_filter,
    df_E3_filter,
    on=["site", "wildtype", "mutant"],
    suffixes=["_E2", "_E3"],
    how="outer",
)

# Display descriptive statistics of the merged DataFrame.
display(df_binding_effect_merge.describe().round(3))

# Add a 'selection' column to distinguish between EFNB2 and EFNB3 data.
df_E2_filter["selection"] = "EFNB2"
df_E3_filter["selection"] = "EFNB3"

# Concatenate the DataFrames for plotting or further analysis.
df_binding_effect_concat = pd.concat([df_E2_filter, df_E3_filter])
EFNB2
site binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3
count 7319.000 6805.000 6805.000 6805.000 6805.000 6805.000 6805.000 6805.000 6619.000 6619.000 6619.000 6619.000 6619.000 6619.000 6619.0
mean 342.744 -0.315 0.507 6.350 -0.110 0.372 7.631 0.994 -0.018 0.180 6.158 -0.091 0.390 6.733 1.0
std 148.521 1.105 0.340 3.208 0.480 0.195 4.381 0.050 0.272 0.173 3.075 0.447 0.188 3.591 0.0
min 71.000 -5.280 0.005 2.000 -1.498 0.010 2.000 0.500 -2.147 0.000 2.000 -1.499 0.033 2.000 1.0
25% 215.000 -0.448 0.273 4.500 -0.366 0.224 5.000 1.000 -0.150 0.063 4.000 -0.308 0.249 4.571 1.0
50% 341.000 -0.040 0.427 5.750 0.012 0.339 6.750 1.000 -0.012 0.135 5.500 0.029 0.357 6.000 1.0
75% 466.000 0.255 0.651 7.500 0.257 0.485 9.000 1.000 0.118 0.242 7.500 0.236 0.500 7.857 1.0
max 602.000 2.160 2.911 43.750 0.604 0.994 72.380 1.000 2.006 2.957 45.000 0.615 1.000 56.710 1.0
In [9]:
def what_got_filtered(df,name):
    print(f' Filtering stats for {name}:\n')
    print(df.shape[0])
    print(df.dropna().shape[0])
    display(df)
    tmp_df = df[df['effect'] < config['min_func_effect_for_binding']]
    print(tmp_df.shape[0])


what_got_filtered(df_E2_filter_missing,'EFNB2')
what_got_filtered(df_E3_filter_missing,'EFNB2')
 Filtering stats for EFNB2:

2943
2834
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models
1 71 Q C 0.010 0.390 2.75 -1.750 0.178 4.625 1.0
7 71 Q I -1.343 1.191 2.25 -2.171 0.255 4.500 1.0
12 71 Q P 1.075 1.498 3.00 -3.385 0.000 7.750 1.0
17 71 Q W 0.059 1.419 3.75 -2.143 0.406 6.500 1.0
18 71 Q Y 1.497 2.096 2.50 -1.827 0.398 3.429 0.5
... ... ... ... ... ... ... ... ... ... ...
10379 156 H P NaN NaN NaN -2.956 0.000 4.000 NaN
10393 164 N C NaN NaN NaN -2.987 0.000 2.250 NaN
10400 170 E C NaN NaN NaN -2.664 0.827 2.000 NaN
10492 253 G P NaN NaN NaN -1.998 0.915 2.000 NaN
10534 294 L W NaN NaN NaN -2.593 0.692 2.250 NaN

2943 rows × 10 columns

2943
 Filtering stats for EFNB2:

3016
2279
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models
12 71 Q P 0.534 0.627 2.0 -3.367 0.000 6.714 1.0
17 71 Q W -0.462 0.045 3.5 -1.798 0.570 5.286 1.0
18 71 Q Y -0.369 NaN 2.0 -2.087 0.289 3.667 0.5
21 72 N C -0.147 0.254 5.0 -2.373 0.253 6.143 1.0
31 72 N P -0.068 0.261 3.5 -3.496 0.000 7.857 1.0
... ... ... ... ... ... ... ... ... ... ...
10812 557 N S NaN NaN NaN -2.459 0.723 2.857 NaN
10836 573 W P NaN NaN NaN -2.631 0.871 2.000 NaN
10847 581 Y L NaN NaN NaN -3.491 0.000 6.143 NaN
10862 592 L K NaN NaN NaN -3.349 0.000 3.286 NaN
10868 595 V P NaN NaN NaN -2.204 0.655 2.500 NaN

3016 rows × 10 columns

3016
In [10]:
# What are the top 5 highest and lowest binding mutants for EFNB2 and EFNB3?
def find_highest_lowest(df, name):
    #df = df[df['site'].isin(config['contact_sites'])]
    print(f"We are analyzing {name}\n")
    print(f"The total number of mutants was: {df.shape[0]}\n")
    tmp_df = df.sort_values(by="binding_mean")
    print("These are the lowest binding mutants detected:")
    display(tmp_df.head(5))

    tmp_df_high = df.sort_values(by="binding_mean", ascending=False)
    print("\nThese are the highest binding mutants detected:\n")
    display(tmp_df_high.head(5))

    # What about mutants with positive entry scores?
    tmp_df = df[df["effect"] > 0].sort_values(by="binding_mean")
    print("These are the lowest binding mutants detected with positive entry scores:")
    display(tmp_df.head(5))

    tmp_df_high = df[df["effect"] > 0].sort_values(by="binding_mean", ascending=False)
    print(
        "\nThese are the highest binding mutants detected with positive entry scores:\n"
    )
    display(tmp_df_high.head(5))

    mean_df = df.groupby('site')['binding_mean'].sum().reset_index()
    display(mean_df.sort_values(by='binding_mean',ascending=False).head(10))


find_highest_lowest(df_E2_filter, "EFNB2")
find_highest_lowest(df_E3_filter, "EFNB3")
We are analyzing EFNB2

The total number of mutants was: 6805

These are the lowest binding mutants detected:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models selection
8285 501 E K -5.280 0.928 24.50 -0.644 0.606 32.00 1.0 EFNB2
8898 532 A V -5.259 1.140 16.25 -0.320 0.312 17.88 1.0 EFNB2
3188 236 R H -5.054 0.883 22.00 -1.253 0.335 29.38 1.0 EFNB2
8072 490 Q K -4.966 1.081 8.50 -0.692 0.539 11.38 1.0 EFNB2
8070 490 Q H -4.752 0.862 8.25 -0.450 0.668 10.88 1.0 EFNB2
These are the highest binding mutants detected:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models selection
9544 566 F C 2.160 2.119 5.00 -0.533 0.455 5.500 0.75 EFNB2
9830 580 I S 2.152 1.768 4.75 -1.001 0.692 6.375 1.00 EFNB2
5007 331 N E 2.144 1.233 3.00 -0.887 0.844 3.500 1.00 EFNB2
3820 269 N E 2.075 2.241 5.00 -1.048 0.484 7.875 1.00 EFNB2
9948 586 N T 1.962 0.578 5.25 -0.816 0.823 7.000 1.00 EFNB2
These are the lowest binding mutants detected with positive entry scores:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models selection
9402 558 A V -4.647 0.730 17.25 0.118 0.364 18.50 1.0 EFNB2
9369 557 N D -4.627 0.399 9.50 0.385 0.479 11.12 1.0 EFNB2
8773 526 L H -4.546 0.782 5.25 0.110 0.392 6.75 1.0 EFNB2
9853 581 Y W -4.519 1.013 10.75 0.091 0.410 11.50 1.0 EFNB2
8090 491 S H -4.505 0.935 8.50 0.112 0.250 10.25 1.0 EFNB2
These are the highest binding mutants detected with positive entry scores:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models selection
4080 282 C S 1.609 1.056 4.50 0.338 0.325 3.714 0.50 EFNB2
7303 450 Q I 1.457 1.194 5.00 0.142 0.399 6.000 1.00 EFNB2
5159 339 S A 1.389 2.087 2.00 0.449 0.149 2.625 0.75 EFNB2
5579 361 T A 1.371 2.108 3.50 0.363 0.450 4.875 1.00 EFNB2
1784 164 N S 1.297 1.136 3.75 0.240 0.291 6.125 1.00 EFNB2
site binding_mean
250 331 15.733
473 564 15.087
97 172 13.884
225 306 13.853
493 586 13.370
103 178 10.001
151 227 9.970
90 165 8.747
239 320 8.574
496 589 8.565
We are analyzing EFNB3

The total number of mutants was: 6619

These are the lowest binding mutants detected:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models selection
9221 555 D K -2.147 1.225 6.0 0.085 0.422 6.714 1.0 EFNB3
9227 555 D R -1.510 0.420 4.0 0.076 0.526 4.143 1.0 EFNB3
8739 530 Q L -1.447 0.477 3.5 -1.019 0.862 5.857 1.0 EFNB3
9761 583 T R -1.410 0.446 5.5 -1.005 0.485 6.286 1.0 EFNB3
9755 583 T K -1.247 0.113 5.0 -0.728 0.644 7.000 1.0 EFNB3
These are the highest binding mutants detected:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models selection
9866 589 R G 2.006 0.659 7.0 -0.966 0.576 8.286 1.0 EFNB3
9699 580 I M 1.337 0.534 5.5 -0.816 0.531 6.714 1.0 EFNB3
3777 270 V Q 1.329 1.028 5.0 -0.885 0.651 5.429 1.0 EFNB3
3719 267 M Q 1.291 1.371 4.5 -0.798 0.818 5.667 1.0 EFNB3
8010 492 Q L 1.261 0.067 5.5 0.483 0.183 5.143 1.0 EFNB3
These are the lowest binding mutants detected with positive entry scores:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models selection
9221 555 D K -2.147 1.225 6.0 0.085 0.422 6.714 1.0 EFNB3
9227 555 D R -1.510 0.420 4.0 0.076 0.526 4.143 1.0 EFNB3
8747 530 Q W -0.997 0.547 3.0 0.018 0.665 3.286 1.0 EFNB3
9214 555 D A -0.830 0.200 3.5 0.376 0.234 3.714 1.0 EFNB3
7038 441 P V -0.746 0.182 5.5 0.371 0.086 5.286 1.0 EFNB3
These are the highest binding mutants detected with positive entry scores:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models selection
8010 492 Q L 1.261 0.067 5.5 0.483 0.183 5.143 1.0 EFNB3
2637 211 G F 1.169 0.289 5.0 0.164 0.572 5.714 1.0 EFNB3
10031 598 P G 1.154 1.559 4.5 0.239 0.515 5.429 1.0 EFNB3
9192 553 S W 0.994 0.435 9.5 0.167 0.406 9.857 1.0 EFNB3
9745 582 D W 0.976 0.338 6.5 0.063 0.310 7.143 1.0 EFNB3
site binding_mean
490 589 11.707
87 161 7.333
469 564 6.722
225 306 6.494
59 132 6.170
242 323 6.104
470 566 5.969
261 342 5.732
129 204 5.653
175 254 5.152

Find the top overlapping binders for both EFNB2 and EFNB3¶

In [11]:
def find_good_binding_for_both(df):
    e2_good = df.sort_values(by='binding_mean_E2',ascending=False)
    e3_good = df.sort_values(by='binding_mean_E3',ascending=False)
    e2_good = e2_good.head(20)
    e3_good = e3_good.head(20)
    combo = pd.merge(e2_good,e3_good,on=['site','wildtype','mutant'],how='inner')
    display(combo)

find_good_binding_for_both(df_binding_effect_merge)
site wildtype mutant binding_mean_E2_x binding_std_E2_x times_seen_binding_E2_x effect_E2_x effect_std_E2_x times_seen_cell_entry_E2_x frac_models_E2_x ... effect_std_E2_y times_seen_cell_entry_E2_y frac_models_E2_y binding_mean_E3_y binding_std_E3_y times_seen_binding_E3_y effect_E3_y effect_std_E3_y times_seen_cell_entry_E3_y frac_models_E3_y
0 566 F C 2.160 2.119 5.00 -0.533 0.455 5.500 0.75 ... 0.455 5.500 0.75 1.132 1.194 3.5 -0.851 0.760 3.833 1.0
1 211 G F 1.957 0.549 4.75 -0.665 0.426 5.125 1.00 ... 0.426 5.125 1.00 1.169 0.289 5.0 0.164 0.572 5.714 1.0

2 rows × 31 columns

In [12]:
# Compare E2 and E3 binders
def find_highest_lowest(df):
    df["binding_diff"] = (df["binding_mean_E2"] - df["binding_mean_E3"]).abs()
    print(
        "These are the mutants with the biggest difference between EFNB2 and EFNB3:\n"
    )
    display(df.sort_values(by="binding_diff", ascending=False).head(10))

    # calculate aggregate differences
    agg_df = (
        df.groupby("site")[["binding_mean_E2", "binding_mean_E3", "binding_diff"]]
        .mean()
        .reset_index()
    )
    print("These are the sites with the biggest difference between EFNB2 and EFNB3:\n")
    display(agg_df.sort_values(by="binding_diff", ascending=False).head(5))


find_highest_lowest(df_binding_effect_merge)
# find_highest_lowest(df_E3_filter,'EFNB3')
These are the mutants with the biggest difference between EFNB2 and EFNB3:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 binding_diff
5836 530 Q F -4.015 0.466 6.25 0.467 0.202 5.375 1.0 0.084 0.062 6.0 0.359 0.274 6.286 1.0 4.099
5363 490 Q Y -4.290 0.900 5.50 -1.088 0.400 6.375 1.0 -0.461 0.574 5.5 -0.314 0.797 5.714 1.0 3.829
5368 491 S H -4.505 0.935 8.50 0.112 0.250 10.250 1.0 -0.872 0.790 7.0 -1.381 0.486 8.286 1.0 3.633
5362 490 Q W -4.205 1.005 3.75 0.235 0.378 4.750 1.0 -0.586 0.323 3.0 -0.151 0.458 3.286 1.0 3.619
5367 491 S G -4.181 0.595 6.50 0.130 0.298 8.625 1.0 -0.562 0.047 5.5 -0.236 0.608 6.286 1.0 3.619
6628 588 I P -2.349 0.840 5.25 -0.256 0.300 4.750 1.0 1.183 0.255 5.5 -0.613 0.382 5.429 1.0 3.532
5386 492 Q K -3.523 0.549 11.00 0.353 0.232 11.620 1.0 0.006 0.077 10.0 0.329 0.240 11.570 1.0 3.529
5390 492 Q R -2.823 0.611 19.25 -0.077 0.272 23.120 1.0 0.644 0.054 20.0 0.509 0.119 20.570 1.0 3.467
1865 239 S H -3.440 0.088 5.25 0.334 0.272 4.500 1.0 -0.041 0.832 4.0 -1.219 0.693 5.000 1.0 3.399
5845 530 Q V -4.327 0.838 6.00 0.106 0.269 8.625 1.0 -0.987 0.896 5.5 -1.443 0.739 7.000 1.0 3.340
These are the sites with the biggest difference between EFNB2 and EFNB3:

site binding_mean_E2 binding_mean_E3 binding_diff
416 505 -3.790625 -0.771000 2.897000
440 530 -3.631600 -0.559250 2.797750
406 491 -3.575429 -0.364667 2.785167
496 588 -3.474250 0.037333 2.686333
408 494 -3.915765 -0.615000 2.411333

Make plots showing correlation between binding and entry for EFNB2 and EFNB3¶

In [13]:
def plot_corr_binding_entry_updated(df, flag):
    # Define interactive selectors for variant selection.
    # 'variant_selector' is for selecting individual variants based on 'site' and 'mutant'.
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
    )
    # 'variant_selector_agg' is for aggregated selection based on 'site' only.
    variant_selector_agg = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )

    # Initialize an empty list to store charts for each unique selection in 'df'.
    empty_chart = []
    
    # Loop through each unique cell selection in the DataFrame.
    for cell in list(df["selection"].unique()):
        # Filter DataFrame for the current selection.
        tmp_df = df[df["selection"] == cell]
        
        # Check if aggregation flag is True to aggregate data.
        if flag == True:
            # Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
            agg_df = (
                tmp_df.groupby("site")[["binding_mean", "effect"]].sum().reset_index()
            )
            # Create a chart using aggregated data with specified encodings.
            chart = (
                alt.Chart(agg_df)
                .mark_point(stroke="black", filled=True)  # Use filled points with black stroke.
                .encode(
                    x=alt.X(
                        "effect",
                        title=f"Summed {cell} Cell Entry",
                        axis=alt.Axis(grid=True),
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"Summed {cell} Binding",
                        axis=alt.Axis(grid=True),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
                    opacity=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0.2)
                    ),
                    size=alt.condition(
                        variant_selector_agg, alt.value(100), alt.value(50)
                    ),
                    strokeWidth=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector_agg, alt.value("orange"), alt.value("black")
                    ),
                    tooltip=["site", "binding_mean", "effect"],
                )
                .add_params(variant_selector_agg)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

        else:
            # Create a chart for individual data points with specified encodings.
            chart = (
                alt.Chart(tmp_df)
                .mark_point(stroke="black", filled=True)
                .encode(
                    x=alt.X(
                        "effect", title=f"{cell} Cell Entry", axis=alt.Axis(grid=True)
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"{cell} Binding",
                        axis=alt.Axis(grid=True),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
                    opacity=alt.condition(
                        variant_selector, alt.value(1), alt.value(0.1)
                    ),
                    size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
                    strokeWidth=alt.condition(
                        variant_selector, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector, alt.value("orange"), alt.value("black")
                    ),
                    tooltip=[
                        "site",
                        "wildtype",
                        "mutant",
                        "binding_mean",
                        "times_seen_binding",
                        "effect",
                    ],
                )
                .add_params(variant_selector)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

    # Combine all charts horizontally with a title.
    combined_chart = alt.hconcat(
        *empty_chart, title=alt.Title("Correlation between binding and entry")
    )
    
    # Return the combined chart for display or further use.
    return combined_chart

# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()

# Save the plot 
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_plot.save(entry_binding_combined_corr_plot)

# Repeat for aggregated data.
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat, True)
entry_binding_corr_plot_agg.display()

# Save the aggregated plot
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_plot_agg.save(entry_binding_combined_corr_plot_agg)
In [14]:
def plot_corr_binding_entry_updated(df, flag):
    # Define interactive selectors for variant selection.
    # 'variant_selector' is for selecting individual variants based on 'site' and 'mutant'.
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
    )
    # 'variant_selector_agg' is for aggregated selection based on 'site' only.
    variant_selector_agg = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )

    # Initialize an empty list to store charts for each unique selection in 'df'.
    empty_chart = []
    
    # Loop through each unique cell selection in the DataFrame.
    for cell in list(df["selection"].unique()):
        # Filter DataFrame for the current selection.
        tmp_df = df[df["selection"] == cell]
        
        # Check if aggregation flag is True to aggregate data.
        if flag == True:
            # Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
            agg_df = (
                tmp_df.groupby("site")[["binding_mean", "effect"]].sum().reset_index()
            )
            # Create a chart using aggregated data with specified encodings.
            chart = (
                alt.Chart(agg_df)
                .mark_point(stroke="black", filled=True)  # Use filled points with black stroke.
                .encode(
                    x=alt.X(
                        "effect",
                        title=f"Summed {cell} Cell Entry",
                        axis=alt.Axis(grid=True),
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"Summed {cell} Binding",
                        axis=alt.Axis(grid=True),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
                    opacity=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0.2)
                    ),
                    size=alt.condition(
                        variant_selector_agg, alt.value(100), alt.value(50)
                    ),
                    strokeWidth=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector_agg, alt.value("orange"), alt.value("black")
                    ),
                    tooltip=["site", "binding_mean", "effect"],
                )
                .add_params(variant_selector_agg)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

        else:
            # Create a chart for individual data points with specified encodings.
            chart = (
                alt.Chart(tmp_df)
                .mark_point(stroke="black", filled=True)
                .encode(
                    x=alt.X(
                        "effect", title=f"{cell} Cell Entry", axis=alt.Axis(grid=True)
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"{cell} Binding",
                        axis=alt.Axis(grid=True),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
                    opacity=alt.condition(
                        variant_selector, alt.value(1), alt.value(0.1)
                    ),
                    size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
                    strokeWidth=alt.condition(
                        variant_selector, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector, alt.value("orange"), alt.value("black")
                    ),
                    tooltip=[
                        "site",
                        "wildtype",
                        "mutant",
                        "binding_mean",
                        "times_seen_binding",
                        "effect",
                    ],
                )
                .add_params(variant_selector)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

    # Combine all charts horizontally with a title.
    combined_chart = alt.hconcat(
        *empty_chart, title=alt.Title("Correlation between binding and entry")
    )
    
    # Return the combined chart for display or further use.
    return combined_chart

# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()

Same plot as above, but slightly different format¶

In [15]:
def plot_entry_binding_corr_heatmap(df):
    empty_chart = []

    for cell in list(df["selection"].unique()):
        tmp_df = df[df["selection"] == cell]
        chart = (
            alt.Chart(tmp_df, title=f"{cell}")
            .mark_rect()
            .encode(
                x=alt.X(
                    "effect", title="Cell Entry", axis=alt.Axis(values=[-2, -1, 0, 1])
                ).bin(maxbins=60),
                y=alt.Y(
                    "binding_mean",
                    title="Binding",
                    axis=alt.Axis(values=[-4, -2, 0, 2]),
                ).bin(maxbins=60),
                color=alt.Color("count()", title="Count").scale(scheme="greenblue"),
            )
        )
        empty_chart.append(chart)

    combined_chart = alt.hconcat(
        *empty_chart, title=alt.Title("Correlation between binding and entry")
    ).resolve_scale(y="shared", x="shared", color="shared")
    return combined_chart


entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_heat.save(entry_binding_corr_heatmap)

Calculate some stats on binding¶

In [16]:
def overall_stats(df, effect, name):
    # Now group sites and find sites where all mutants are deleterious
    filtered_df = df.groupby("site").filter(lambda group: (group[effect] < -0.25).all())
    # Which sites are these?
    unique = filtered_df["site"].unique()
    # Convert unique to a Pandas Series
    unique_series = pd.Series(unique)

    # Find the common elements that are also contact sites
    unique_contact_bool = unique_series.isin(config["contact_sites"])
    # Filter and get the common elements
    common_elements = unique_series[unique_contact_bool]

    print(f"The dataset we are analyzing is: {name}\n")

    # Print the common elements
    print(
        f"Here are the contact sites that only have negative binding scores: {list(common_elements)}\n"
    )

    print(f"There are {len(unique)} sites with all negative binding score mutants\n")
    print(
        f"These are the sites with all negative binding score mutants: {list(unique)}\n"
    )

    # Now find sites with low and high binding (median)
    median_df = (
        df.groupby("site")["binding_mean"]
        .max()
        .reset_index()
        .sort_values(by="binding_mean", ascending=False)
    )
    print("These are the sites with the highest binding mutants:\n")
    display(median_df.head(5))

    # Now calculate mutant number
    total_mutants = df.shape[0]

    mutants_above_cutoff_tolerated = df[df["effect"] > 0]
    mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[
        ["site", "effect", "binding_mean", "wildtype", "mutant"]
    ]

    total_sites = df["site"].unique().shape[0]

    print(f"The total number of sites are: {total_sites}")


overall_stats(df_E2_filter, "binding_mean", "EFNB2")
overall_stats(df_E3_filter, "binding_mean", "EFNB3")
The dataset we are analyzing is: EFNB2

Here are the contact sites that only have negative binding scores: [242, 389, 488, 490, 491, 501, 504, 505, 531, 532, 533, 557, 579, 581, 588]

There are 41 sites with all negative binding score mutants

These are the sites with all negative binding score mutants: [100, 116, 220, 236, 238, 242, 243, 248, 346, 351, 352, 389, 390, 400, 435, 438, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590]

These are the sites with the highest binding mutants:

site binding_mean
474 566 2.160
487 580 2.152
250 331 2.144
188 269 2.075
493 586 1.962
The total number of sites are: 510
The dataset we are analyzing is: EFNB3

Here are the contact sites that only have negative binding scores: [389, 488, 501, 505, 531, 532]

There are 15 sites with all negative binding score mutants

These are the sites with all negative binding score mutants: [108, 352, 389, 467, 486, 488, 494, 495, 497, 501, 505, 510, 531, 532, 584]

These are the sites with the highest binding mutants:

site binding_mean
490 589 2.006
482 580 1.337
189 270 1.329
186 267 1.291
405 492 1.261
The total number of sites are: 504

Find sites with opposite effects on binding¶

In [17]:
# find sites that are different
def find_biggest_differences(df):
    efnb2_good_efnb3_bad = df[
        (df["binding_mean_E2"] > 0.5) & (df["binding_mean_E3"] < -0.5)
    ].sort_values(by="binding_mean_E2", ascending=False)
    display(efnb2_good_efnb3_bad)

    efnb2_bad_efnb3_good = df[
        (df["binding_mean_E2"] < -0.5) & (df["binding_mean_E3"] > 0.5)
    ].sort_values(by="binding_mean_E3", ascending=False)
    display(efnb2_bad_efnb3_good)


find_biggest_differences(df_binding_effect_merge)
site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 binding_diff
6042 546 L H 1.523 1.145 5.25 -1.148 0.387 6.750 1.0 -0.543 0.665 3.5 -0.851 0.544 5.143 1.0 2.066
2621 303 P C 1.345 1.577 5.00 -0.718 0.590 5.000 1.0 -0.566 0.098 4.0 -0.697 0.594 4.857 1.0 1.911
122 79 N S 1.292 1.652 3.00 -0.635 0.496 2.375 1.0 -0.646 0.260 3.0 -0.129 0.569 3.429 1.0 1.938
103 78 D I 0.659 0.634 5.25 -0.931 0.398 6.500 1.0 -0.566 0.755 5.0 -0.517 0.845 5.143 1.0 1.225
1682 224 M Q 0.647 0.880 3.50 0.166 0.315 2.750 1.0 -0.581 0.381 3.5 0.206 0.292 4.286 1.0 1.228
5708 520 I G 0.591 1.013 5.00 -1.465 0.349 8.000 1.0 -0.530 0.148 5.5 -1.309 0.319 7.000 1.0 1.121
1032 178 V T 0.529 0.726 3.75 -0.528 0.759 4.250 1.0 -0.513 0.600 3.0 -0.019 0.425 3.571 1.0 1.042
site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 binding_diff
6628 588 I P -2.349 0.840 5.25 -0.256 0.300 4.750 1.0 1.183 0.255 5.5 -0.613 0.382 5.429 1.0 3.532
6734 598 P G -0.552 0.785 5.00 -0.867 0.939 4.625 1.0 1.154 1.559 4.5 0.239 0.515 5.429 1.0 1.706
5237 479 W S -0.802 1.272 3.25 -1.304 0.878 5.250 1.0 0.731 2.957 2.0 -1.469 0.612 3.286 1.0 1.533
3180 338 R G -0.517 0.507 4.50 0.300 0.422 5.250 1.0 0.684 0.976 4.0 0.158 0.340 3.429 1.0 1.201
5364 491 S A -0.932 0.537 5.50 -0.669 0.806 6.250 1.0 0.645 0.010 6.5 0.143 0.358 6.143 1.0 1.577
5390 492 Q R -2.823 0.611 19.25 -0.077 0.272 23.120 1.0 0.644 0.054 20.0 0.509 0.119 20.570 1.0 3.467

Find correlations between EFNB2 and EFNB3 binding¶

In [18]:
def plot_entry_binding_corr(df):
    chart = (
        alt.Chart(df, title="Correlation Between Mutant Binding Scores")
        .mark_rect()
        .encode(
            x=alt.X(
                "binding_mean_E2",
                title="EFNB2 binding",
                axis=alt.Axis(values=[-5, 0, 2]),
            ).bin(maxbins=40),
            y=alt.Y(
                "binding_mean_E3",
                title="EFNB3 binding",
                axis=alt.Axis(values=[-2, 0, 2]),
            ).bin(maxbins=40),
            color=alt.Color("count()", title="Count").scale(scheme="greenblue"),
        )
    )
    return chart


entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_heatmap_1.save(binding_corr_heatmap)
In [19]:
def plot_affinity_solid(df):
    df = df.dropna()
    # calculate r value
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        df["binding_mean_E2"], df["binding_mean_E3"]
    )
    r_value = float(r_value)
    # make chart
    chart = (
        alt.Chart(
            df,
            title=alt.Title(
                "Correlation between Mutant Binding Scores", subtitle=f"r={r_value:.2f}"
            ),
        )
        .mark_point(color="black", size=30, opacity=0.2, filled=True)
        .encode(
            x=alt.X("binding_mean_E2", title=("EFNB2 Binding")),
            y=alt.Y("binding_mean_E3", title=("EFNB3 Binding")),
            tooltip=[
                "site",
                "wildtype",
                "mutant",
                "binding_mean_E2",
                "binding_mean_E3",
                "effect_E2",
                "effect_E3",
            ],
        )
    )
    highlight = chart.transform_filter(
        (alt.datum.site == 458) & (alt.datum.mutant == 'Y') | # Example condition
        (alt.datum.site == 507) & (alt.datum.mutant == 'I') #|
        #(alt.datum.site == 589) & (alt.datum.mutant == 'G') |
        #(alt.datum.site == 580) & (alt.datum.mutant == 'S') | 
        #(alt.datum.site == 492) & (alt.datum.mutant == 'L') |
        #(alt.datum.site == 492) & (alt.datum.mutant == 'R') |
        #(alt.datum.site == 566) & (alt.datum.mutant == 'C') |
        #(alt.datum.site == 211) & (alt.datum.mutant == 'F') |
        #(alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        #(alt.datum.site == 546) & (alt.datum.mutant == 'H') |
        #(alt.datum.site == 555) & (alt.datum.mutant == 'K') 

        


    ).mark_point(
        color="black", size=40, opacity=1, filled=True, stroke='orange', strokeWidth=1
    )
    min = int(df["binding_mean_E2"].min())
    max = int(df["binding_mean_E3"].max())
    text = (
        alt.Chart({"values": [{"x": min, "y": max, "text": f"r = {r_value:.2f}"}]})
        .mark_text(align="left", baseline="top", dx=-10, dy=-20)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    chart_and_text = chart + highlight
    return chart_and_text


E2_E3_corr = plot_affinity_solid(df_binding_effect_merge)
E2_E3_corr.display()
if entry_binding_combined_corr_plot is not None:
    E2_E3_corr.save(E2_E3_correlation)

Plot correlations between summary statistics for each site¶

In [20]:
def plot_affinity_solid_mean(df):
    df = df.dropna()
    means = (
        df.groupby("site")
        .agg(
            {
                "effect_E2": "mean",
                "effect_E3": "mean",
                "binding_mean_E2": "mean",
                "binding_mean_E3": "mean",
                "wildtype": "first",
            }
        )
        .reset_index()
    )
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        means["binding_mean_E2"], means["binding_mean_E3"]
    )
    r_value = float(r_value)
    chart = (
        alt.Chart(
            means,
            title=alt.Title(
                "Correlation between Aggregate Mutant Binding Scores",
                subtitle=f"r={r_value:.2f}",
            ),
        )
        .mark_point(size=50, opacity=0.3)
        .encode(
            x=alt.X(
                "binding_mean_E2",
                title=("Avg EFNB2 Binding"),
                axis=alt.Axis(tickCount=3),
            ),
            y=alt.Y(
                "binding_mean_E3",
                title=("Avg EFNB3 Binding"),
                axis=alt.Axis(tickCount=3),
            ),
            tooltip=[
                "site",
                "wildtype",
                "binding_mean_E2",
                "binding_mean_E3",
                "effect_E2",
                "effect_E3",
            ],
        )
    )
    text = (
        alt.Chart({"values": [{"x": -3.5, "y": 0.5, "text": f"r = {r_value:.2f}"}]})
        .mark_text(align="left", baseline="top", dx=0, dy=0)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    chart_and_text = chart
    return chart_and_text


E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
if entry_binding_combined_corr_plot is not None:
    E2_E3_site_corr.save(E2_E3_correlation_site)

if entry_binding_combined_corr_plot is not None:
    (E2_E3_corr | E2_E3_site_corr).save(combined_E2_E3_site_corr)

Make plot showing binding by site (median)¶

In [21]:
def plot_affinity_by_site_median(df):
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )
    empty_charts = []
    for selection in ["binding_mean_E2", "binding_mean_E3"]:
        if selection == "binding_mean_E2":
            name = "EFNB2 Binding"
        else:
            name = "EFNB3 Binding"
        mean = df.groupby("site")[selection].mean().reset_index()
        mean = mean[mean[selection] >= 0]
        chart = (
            alt.Chart(mean)
            .mark_point(stroke="black", filled=True, size=50)
            .encode(
                x=alt.X(
                    "site",
                    title=("Site"),
                    axis=alt.Axis(grid=True, tickCount=4),
                    scale=alt.Scale(domain=[70, 602]),
                ),
                y=alt.Y(selection, title=(name), axis=alt.Axis(grid=True, tickCount=3)),
                tooltip=["site"],
                color=alt.condition(
                    variant_selector, alt.value("orange"), alt.value("black")
                ),
                opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.5)),
                strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            )
            .properties(height=150, width=500)
            .add_params(variant_selector)
        )
        empty_charts.append(chart)
    combined_chart = alt.vconcat(*empty_charts, spacing=1, title="Max Binding by Site")
    return combined_chart


binding_by_site = plot_affinity_by_site_median(df_binding_effect_merge)
binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
    binding_by_site.save(binding_by_site_plot)
In [22]:
def plot_affinity_by_contact_site(df, sites_to_show, title_text):
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )
    empty_charts = []

    contact_df = df[df["site"].isin(sites_to_show)]
    sites = list(contact_df["site"].unique())

    for selection in df["selection"].unique():
        tmp_df = contact_df[contact_df["selection"] == selection]
        mean = tmp_df.groupby("site")["binding_mean"].max().reset_index()

        chart = (
            alt.Chart(mean)
            .mark_point(size=100)
            .encode(
                x=alt.X(
                    "site:O",
                    sort=sites,
                    title=("Site"),
                    axis=alt.Axis(grid=True, labelAngle=-90),
                    scale=alt.Scale(domain=sites),
                ),
                y=alt.Y(
                    "binding_mean", title=(f"{selection}"), axis=alt.Axis(grid=True)
                ),
                tooltip=["site"],
                color=alt.condition(
                    variant_selector, alt.value("orange"), alt.value("black")
                ),
                strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
            )
            .add_params(variant_selector)
        )
        empty_charts.append(chart)
    combined_chart = alt.vconcat(*empty_charts, spacing=1, title=title_text)
    return combined_chart


contact_binding_by_site = plot_affinity_by_contact_site(
    df_binding_effect_concat, config["contact_sites"], "Max Binding in Contact"
)
contact_binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
    contact_binding_by_site.save(max_binding_in_contact)

contact_binding_by_site_stalk = plot_affinity_by_contact_site(
    df_binding_effect_concat, list(range(96, 147)), "Max Binding in Stalk"
)
contact_binding_by_site_stalk.display()
if entry_binding_combined_corr_plot is not None:
    contact_binding_by_site_stalk.save(max_binding_in_stalk)

Make bubble plots for binding in different areas of receptor pocket¶

In [23]:
def make_boxplot_binding_region(
    df, title
):  # Create a box plot using Altair for aggregated means
    barrel_ranges = {
        "Hydrophobic": config["hydrophobic"],
        "Salt Bridges": config["salt_bridges"],
        "Hydrogen Bonds": config["h_bond_total"],
        "Contact": config["contact_sites"],
        "Overall": list(range(71, 602)),
    }

    mean_df = df.groupby("site")[["binding_mean"]].median().reset_index()
    custom_order = [
        "Hydrophobic",
        "Salt Bridges",
        "Hydrogen Bonds",
        "Contact",
        "Overall",
    ]
    agg_means = []

    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = mean_df[mean_df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {"barrel": barrel, "effect": row["binding_mean"], "site": row["site"]}
            )
        agg_means_df = pd.DataFrame(agg_means)
    chart = (
        alt.Chart(agg_means_df)
        .mark_point(size=50, opacity=0.4)
        .encode(
            x=alt.X(
                "barrel:O", sort=custom_order, title=None, axis=alt.Axis(labelAngle=-90)
            ),
            y=alt.Y(
                "effect",
                title=f"Median {title} Binding",
                axis=alt.Axis(grid=True, tickCount=4),
            ),
            xOffset="random:Q",
            tooltip=["barrel", "effect", "site"],
        )
        .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
    )

    return chart.display()


make_boxplot_binding_region(df_E2_filter, "EFNB2")
make_boxplot_binding_region(df_E3_filter, "EFNB3")

make boxplot of binding scores by region¶

In [24]:
def make_boxplot_binding_region(df):
    barrel_ranges = {
        "Stalk": list(range(96, 147)),
        "Neck": list(range(148, 165)),
        "Linker": list(range(166, 177)),
        "Head": list(range(178, 602)),
        "Receptor Contact": config["contact_sites"],
        "Total": list(range(71, 602)),
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
    empty_charts = []
    for selection in df["selection"].unique():
        tmp_df = df[df["selection"] == selection]
        agg_means = []

        # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
        for barrel, sites in barrel_ranges.items():
            subset = tmp_df[tmp_df["site"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {
                        "region": barrel,
                        "binding_mean": row["binding_mean"],
                        "site": row["site"],
                    }
                )
            agg_means_df = pd.DataFrame(agg_means)

        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_boxplot(color="darkgray", extent="min-max", opacity=1)
            .encode(
                x=alt.X(
                    "region:O",
                    sort=custom_order,
                    title="RBP Region",
                    axis=alt.Axis(labelAngle=-90),
                ),
                y=alt.Y(
                    "binding_mean",
                    title=f"Binding",
                    axis=alt.Axis(grid=True, tickCount=4),
                ),
                tooltip=["region", "binding_mean", "site"],
            )
            .properties(width=config["width"], height=config["height"])
        )
        empty_charts.append(chart)
    combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
        y="shared", x="shared", color="independent"
    )
    return combined_effect_chart


entry_region_boxplot = make_boxplot_binding_region(df_binding_effect_concat)
entry_region_boxplot.display()
if entry_binding_combined_corr_plot is not None:
    entry_region_boxplot.save(binding_region_boxplot_plot)
In [25]:
def make_bubble_binding_region(df):
    barrel_ranges = {
        "Stalk": list(range(96, 147)),
        "Neck": list(range(148, 165)),
        "Linker": list(range(166, 177)),
        "Head": list(range(178, 602)),
        "Receptor Contact": config["contact_sites"],
        "Total": list(range(71, 602)),
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
    empty_charts = []
    for selection in df["selection"].unique():
        tmp_df = df[df["selection"] == selection]
        agg_means = []

        # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
        for barrel, sites in barrel_ranges.items():
            subset = tmp_df[tmp_df["site"].isin(sites)]
            for _, row in subset.iterrows():
                agg_means.append(
                    {
                        "region": barrel,
                        "binding_mean": row["binding_mean"],
                        "site": row["site"],
                        "mutant": row["mutant"],
                    }
                )
            agg_means_df = pd.DataFrame(agg_means)

        variant_selector = alt.selection_point(
            on="mouseover", empty=False, fields=["site", "mutant"], value=1
        )

        chart = (
            alt.Chart(agg_means_df, title=f"{selection}")
            .mark_point(opacity=0.3, stroke="black")
            .encode(
                x=alt.X(
                    "region:O",
                    sort=custom_order,
                    title="RBP Region",
                    axis=alt.Axis(labelAngle=-90),
                ),
                y=alt.Y(
                    "binding_mean",
                    title=f"Binding",
                    axis=alt.Axis(grid=True, tickCount=4),
                ),
                xOffset="random:Q",
                tooltip=["region", "binding_mean", "site", "mutant"],
                color=alt.condition(
                    variant_selector, alt.value("orange"), alt.value("black")
                ),
                opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
                strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
                size=alt.condition(variant_selector, alt.value(50), alt.value(15)),
            )
            .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
            .properties(width=config["width"], height=config["height"])
        ).add_params(variant_selector)
        empty_charts.append(chart)
    combined_effect_chart = (
        alt.hconcat(*empty_charts)
        .resolve_scale(y="shared", x="shared", color="independent")
        .add_params(variant_selector)
    )
    return combined_effect_chart


entry_region_bubble = make_bubble_binding_region(df_binding_effect_concat)
entry_region_bubble.display()
if entry_binding_combined_corr_plot is not None:
    entry_region_bubble.save(binding_region_bubble_plot)
In [ ]: